1 Load up data

# Read the raw survey export into a data frame and list every column's type.
kaggle <- read.csv(file = "kaggle.csv")
str(object = kaggle)
## 'data.frame':    23997 obs. of  57 variables:
##  $ Compensation                        : chr  "" "" "" ">$1,000,000" ...
##  $ Age                                 : chr  "30-34" "30-34" "18-21" "30-34" ...
##  $ Gender                              : chr  "Man" "Man" "Man" "Man" ...
##  $ Country                             : chr  "India" "Algeria" "Egypt" "Egypt" ...
##  $ Student                             : chr  "No" "No" "Yes" "No" ...
##  $ Years.Programming                   : chr  "" "1-3 years" "1-3 years" "10-20 years" ...
##  $ Similar.Title                       : chr  "" "" "" "Machine Learning/ MLops Engineer" ...
##  $ Industry.of.Work                    : chr  "" "" "" "Other" ...
##  $ Company.Size                        : chr  "" "" "" "0-49 employees" ...
##  $ How.many.individuals.are.responsible: chr  "" "" "" "0" ...
##  $ Incorporate.Machine.Learning        : chr  "" "" "" "I do not know" ...
##  $ Years.Used.Machine.Learning         : chr  "" "Under 1 year" "1-2 years" "5-10 years" ...
##  $ Helpful.University                  : chr  "" "University courses" "" "" ...
##  $ Helpful.Online.Courses              : chr  "" "" "Online courses (Coursera, EdX, etc)" "Online courses (Coursera, EdX, etc)" ...
##  $ Helpful.Social.Media                : chr  "" "" "" "" ...
##  $ Helpful.Video.Platform              : chr  "" "" "Video platforms (YouTube, Twitch, etc)" "Video platforms (YouTube, Twitch, etc)" ...
##  $ Helpful.Kaggle                      : chr  "" "Kaggle (notebooks, competitions, etc)" "Kaggle (notebooks, competitions, etc)" "Kaggle (notebooks, competitions, etc)" ...
##  $ Helpful.None                        : chr  "" "" "" "" ...
##  $ Media.on.Social.Twitter             : chr  "" "" "Twitter (data science influencers)" "" ...
##  $ Media.on.Social.Email.Newsletters   : chr  "" "" "Email newsletters (Data Elixir, O'Reilly Data & AI, etc)" "" ...
##  $ Media.on.Reddit                     : chr  "" "" "" "" ...
##  $ Media.on.Kaggle                     : chr  "" "" "Kaggle (notebooks, forums, etc)" "Kaggle (notebooks, forums, etc)" ...
##  $ Media.on.Course.Forums              : chr  "" "" "" "" ...
##  $ Media.on.Youtube                    : chr  "" "" "YouTube (Kaggle YouTube, Cloud AI Adventures, etc)" "YouTube (Kaggle YouTube, Cloud AI Adventures, etc)" ...
##  $ Media.on.Podcasts                   : chr  "" "" "Podcasts (Chai Time Data Science, O’Reilly Data Show, etc)" "" ...
##  $ Media.on.Blogs                      : chr  "" "" "" "Blogs (Towards Data Science, Analytics Vidhya, etc)" ...
##  $ Media.on.Journal.Publications       : chr  "" "" "" "Journal Publications (peer-reviewed journals, conference proceedings, etc)" ...
##  $ Media.on.Slack.Communities          : chr  "" "" "" "Slack Communities (ods.ai, kagglenoobs, etc)" ...
##  $ No.Media.Sources                    : chr  "" "" "" "" ...
##  $ Data.Science.on.Coursera            : chr  "" "" "Coursera" "Coursera" ...
##  $ Data.Science.on.edX                 : chr  "" "" "edX" "" ...
##  $ Data.Science.on.Kaggle.Learn.Courses: chr  "" "" "" "" ...
##  $ Data.Science.on.DataCamp            : chr  "" "" "DataCamp" "DataCamp" ...
##  $ Data.Science.on.Fast.ai             : chr  "" "" "" "" ...
##  $ Data.Science.on.Udacity             : chr  "" "" "Udacity" "" ...
##  $ Data.Science.on.Udemy               : chr  "" "" "Udemy" "" ...
##  $ Data.Science.on.LinkedIn.Learning   : chr  "" "" "LinkedIn Learning" "" ...
##  $ Cloud.certification.programs        : chr  "" "" "" "" ...
##  $ Data.Science.University.Courses     : chr  "" "University Courses (resulting in a university degree)" "University Courses (resulting in a university degree)" "" ...
##  $ No.Data.Science.Courses             : chr  "" "" "" "" ...
##  $ Highest.Level.of.Formal.Education   : chr  "" "Master’s degree" "Bachelor’s degree" "No formal education past high school" ...
##  $ Published.Academic.Research.Papers  : chr  "" "Yes" "" "" ...
##  $ Python                              : chr  "" "" "Python" "Python" ...
##  $ R                                   : chr  "" "" "" "" ...
##  $ SQL                                 : chr  "" "" "SQL" "" ...
##  $ C                                   : chr  "" "" "C" "C" ...
##  $ C.                                  : chr  "" "" "" "" ...
##  $ C..                                 : chr  "" "" "" "C++" ...
##  $ Java                                : chr  "" "Java" "" "Java" ...
##  $ Javascript                          : chr  "" "" "" "Javascript" ...
##  $ Bash                                : chr  "" "" "" "Bash" ...
##  $ PHP                                 : chr  "" "" "" "PHP" ...
##  $ MATLAB                              : chr  "" "" "MATLAB" "MATLAB" ...
##  $ Julia                               : chr  "" "" "" "" ...
##  $ Go                                  : chr  "" "" "" "" ...
##  $ No.Programming.Languages            : chr  "" "" "" "" ...
##  $ ML.Hubs...Repositories.Used         : chr  "" "" "" " Huggingface Models " ...

2 Move columns around to make data cleaning easier

# Place the two single-answer columns (ML hubs, then education level) directly
# in front of Helpful.University so the single-answer columns sit together and
# the multi-select columns form one contiguous run after them.
kaggle <- kaggle %>%
  relocate(
    all_of(c("ML.Hubs...Repositories.Used", "Highest.Level.of.Formal.Education")),
    .before = "Helpful.University"
  )

3 Clean up data

3.1 Binary clean up

# Recode the indicator-style columns to 1/0.
# Columns 15-57 hold the selected option's text when it was chosen and ""
# when it was not, so any non-empty value becomes 1.
# Student (column 5) is a Yes/No answer: "Yes" becomes 1, everything else 0.
kaggle <- kaggle %>%
  mutate(
    across(15:57, ~ ifelse(.x == "", 0, 1)),
    Student = ifelse(Student == "Yes", 1, 0)
  )

3.2 Ordinal/Factor Clean up

# Convert the categorical text columns to factors and mark unanswered
# Compensation as missing.
#   * Columns 2-4 and 6-14 are categorical survey answers stored as text.
#   * Compensation is the response variable; it is still a set of salary bins
#     at this point, so it becomes a factor too.
#   * Empty Compensation cells become NA so they can be imputed later. The ""
#     entry stays in the factor's level list, just unused. The other factor
#     columns keep "" as an ordinary level and are not set to NA.
kaggle <- kaggle %>%
  mutate(
    across(c(2:4, 6:14), as.factor),
    Compensation = na_if(as.factor(Compensation), "")
  )

# Print out our changes
str(kaggle)
## 'data.frame':    23997 obs. of  57 variables:
##  $ Compensation                        : Factor w/ 27 levels "",">$1,000,000",..: NA NA NA 2 NA NA NA 2 2 NA ...
##  $ Age                                 : Factor w/ 11 levels "18-21","22-24",..: 4 4 1 4 7 1 1 3 7 2 ...
##  $ Gender                              : Factor w/ 5 levels "Man","Nonbinary",..: 1 1 1 1 1 5 1 1 1 1 ...
##  $ Country                             : Factor w/ 58 levels "Algeria","Argentina",..: 21 1 14 14 21 21 21 48 56 35 ...
##  $ Student                             : num  0 0 1 0 1 1 1 0 0 1 ...
##  $ Years.Programming                   : Factor w/ 8 levels "","< 1 years",..: 1 3 3 4 7 3 3 2 7 6 ...
##  $ Similar.Title                       : Factor w/ 16 levels "","Currently not employed",..: 1 1 1 10 1 1 1 7 11 1 ...
##  $ Industry.of.Work                    : Factor w/ 16 levels "","Academics/Education",..: 1 1 1 14 1 1 1 5 5 1 ...
##  $ Company.Size                        : Factor w/ 6 levels "","0-49 employees",..: 1 1 1 2 1 1 1 2 2 1 ...
##  $ How.many.individuals.are.responsible: Factor w/ 8 levels "","0","14-Oct",..: 1 1 1 2 1 1 1 2 5 1 ...
##  $ Incorporate.Machine.Learning        : Factor w/ 7 levels "","I do not know",..: 1 1 1 2 1 1 1 4 3 1 ...
##  $ Years.Used.Machine.Learning         : Factor w/ 10 levels "","1-2 years",..: 1 10 2 8 9 2 10 10 9 6 ...
##  $ ML.Hubs...Repositories.Used         : Factor w/ 10 levels "","  TensorFlow Hub ",..: 1 1 1 3 1 5 1 5 1 2 ...
##  $ Highest.Level.of.Formal.Education   : Factor w/ 8 levels "","Bachelor’s degree",..: 1 5 2 6 2 5 8 4 5 2 ...
##  $ Helpful.University                  : num  0 1 0 0 1 0 0 0 0 1 ...
##  $ Helpful.Online.Courses              : num  0 0 1 1 0 1 0 1 0 1 ...
##  $ Helpful.Social.Media                : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ Helpful.Video.Platform              : num  0 0 1 1 0 0 1 1 0 0 ...
##  $ Helpful.Kaggle                      : num  0 1 1 1 0 0 1 1 1 1 ...
##  $ Helpful.None                        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Media.on.Social.Twitter             : num  0 0 1 0 0 0 0 1 0 0 ...
##  $ Media.on.Social.Email.Newsletters   : num  0 0 1 0 0 0 0 1 0 0 ...
##  $ Media.on.Reddit                     : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ Media.on.Kaggle                     : num  0 0 1 1 0 1 0 1 0 1 ...
##  $ Media.on.Course.Forums              : num  0 0 0 0 0 1 0 1 0 0 ...
##  $ Media.on.Youtube                    : num  0 0 1 1 0 1 1 1 0 0 ...
##  $ Media.on.Podcasts                   : num  0 0 1 0 0 0 0 1 0 0 ...
##  $ Media.on.Blogs                      : num  0 0 0 1 1 1 1 1 0 0 ...
##  $ Media.on.Journal.Publications       : num  0 0 0 1 0 0 0 1 0 0 ...
##  $ Media.on.Slack.Communities          : num  0 0 0 1 0 0 0 1 0 0 ...
##  $ No.Media.Sources                    : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ Data.Science.on.Coursera            : num  0 0 1 1 0 1 0 1 0 1 ...
##  $ Data.Science.on.edX                 : num  0 0 1 0 0 1 0 0 0 0 ...
##  $ Data.Science.on.Kaggle.Learn.Courses: num  0 0 0 0 0 0 1 1 1 1 ...
##  $ Data.Science.on.DataCamp            : num  0 0 1 1 0 1 0 0 0 1 ...
##  $ Data.Science.on.Fast.ai             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Data.Science.on.Udacity             : num  0 0 1 0 0 1 0 0 0 0 ...
##  $ Data.Science.on.Udemy               : num  0 0 1 0 0 1 0 0 0 1 ...
##  $ Data.Science.on.LinkedIn.Learning   : num  0 0 1 0 0 0 0 0 0 1 ...
##  $ Cloud.certification.programs        : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ Data.Science.University.Courses     : num  0 1 1 0 0 0 0 0 0 0 ...
##  $ No.Data.Science.Courses             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Published.Academic.Research.Papers  : num  0 1 0 0 0 1 0 0 1 0 ...
##  $ Python                              : num  0 0 1 1 1 1 1 1 1 1 ...
##  $ R                                   : num  0 0 0 0 0 0 0 1 0 1 ...
##  $ SQL                                 : num  0 0 1 0 0 1 1 1 0 1 ...
##  $ C                                   : num  0 0 1 1 0 0 0 1 1 1 ...
##  $ C.                                  : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ C..                                 : num  0 0 0 1 1 0 1 1 0 1 ...
##  $ Java                                : num  0 1 0 1 1 0 0 1 0 0 ...
##  $ Javascript                          : num  0 0 0 1 0 0 1 1 0 0 ...
##  $ Bash                                : num  0 0 0 1 0 0 0 0 0 1 ...
##  $ PHP                                 : num  0 0 0 1 0 0 0 1 0 0 ...
##  $ MATLAB                              : num  0 0 1 1 0 0 0 0 0 0 ...
##  $ Julia                               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Go                                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ No.Programming.Languages            : num  0 0 0 0 0 0 0 0 0 0 ...

4 Plot out data

# Bar chart of how many rows of `kaggle` take each value of one column.
#
# variable: name of a column of `kaggle`, given as a single string.
# Returns a ggplot object (nothing is drawn until it is printed).
create_distribution_chart <- function(variable) {
  ggplot(kaggle, aes(x = .data[[variable]])) +
    # geom_bar() counts rows per distinct value. The earlier
    # geom_histogram(binwidth = 1, stat = "count") drew the same bars, but
    # `binwidth` is not a parameter of the count stat and was ignored.
    geom_bar(fill = "lightseagreen", color = "turquoise4", alpha = 0.7) +
    labs(
      title = paste("Distribution Chart -", variable),
      x = variable,
      y = "Frequency"
    )
}

# Get the list of variable names
variable_names <- names(kaggle)

# Create distribution charts for all variables
charts <- purrr::map(variable_names, create_distribution_chart)

# Print or display the charts (you can use other functions like ggsave to save them to files)
print(charts)
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

## 
## [[19]]

## 
## [[20]]

## 
## [[21]]

## 
## [[22]]

## 
## [[23]]

## 
## [[24]]

## 
## [[25]]

## 
## [[26]]

## 
## [[27]]

## 
## [[28]]

## 
## [[29]]

## 
## [[30]]

## 
## [[31]]

## 
## [[32]]

## 
## [[33]]

## 
## [[34]]

## 
## [[35]]

## 
## [[36]]

## 
## [[37]]

## 
## [[38]]

## 
## [[39]]

## 
## [[40]]

## 
## [[41]]

## 
## [[42]]

## 
## [[43]]

## 
## [[44]]

## 
## [[45]]

## 
## [[46]]

## 
## [[47]]

## 
## [[48]]

## 
## [[49]]

## 
## [[50]]

## 
## [[51]]

## 
## [[52]]

## 
## [[53]]

## 
## [[54]]

## 
## [[55]]

## 
## [[56]]

## 
## [[57]]

We see that about 66% of the rows have no value for our response variable, Compensation. Personally, I do not want to lose 66% of the data, so I will impute the missing values instead.

5 Compensation variable reworking

5.1 Turn compensation into a continuous variable

# Turn the compensation bins into continuous numbers: each bin label is
# replaced by a uniform random draw between that bin's bounds.
#
# Seed the generator first so the draws are reproducible between runs.
set.seed(458)

# Sampling bounds for every bin label.
# NOTE(review): the first five bins start 250 above the label's lower edge
# (250, 1250, 2250, ...) while every later bin starts at its lower edge. The
# bounds are kept exactly as they were; confirm the offset is intended.
compensation_bins <- data.frame(
  label = c(
    "$0-999", "1,000-1,999", "2,000-2,999", "3,000-3,999", "4,000-4,999",
    "5,000-7,499", "7,500-9,999", "10,000-14,999", "15,000-19,999",
    "20,000-24,999", "25,000-29,999", "30,000-39,999", "40,000-49,999",
    "50,000-59,999", "60,000-69,999", "70,000-79,999", "80,000-89,999",
    "90,000-99,999", "100,000-124,999", "125,000-149,999", "150,000-199,999",
    "200,000-249,999", "250,000-299,999", "300,000-499,999",
    "$500,000-999,999", ">$1,000,000"
  ),
  lower = c(
    250, 1250, 2250, 3250, 4250,
    5000, 7500, 10000, 15000,
    20000, 25000, 30000, 40000,
    50000, 60000, 70000, 80000,
    90000, 100000, 125000, 150000,
    200000, 250000, 300000,
    500000, 1000000
  ),
  upper = c(
    999, 1999, 2999, 3999, 4999,
    7499, 9999, 14999, 19999,
    24999, 29999, 39999, 49999,
    59999, 69999, 79999, 89999,
    99999, 124999, 149999, 199999,
    249999, 299999, 499999,
    999999, 3000000
  ),
  stringsAsFactors = FALSE
)

# Draw one value per element of `labels` from inside its bin.
#
# labels: factor or character vector of bin labels; NA means "not answered".
# bins:   data frame with columns label, lower, upper.
# Returns a numeric vector of the same length: a uniform draw for a known
# label, NA for NA, and 0 for a non-missing label that is not in `bins`
# (the same fallback the nested ifelse() chain had).
#
# One independent draw is made per matching row, so nothing depends on
# hand-typed per-bin row counts being recycled by ifelse().
draw_compensation <- function(labels, bins = compensation_bins) {
  labels <- as.character(labels)
  bin_row <- match(labels, bins$label)
  known <- !is.na(bin_row)

  drawn <- rep(NA_real_, length(labels))
  drawn[known] <- runif(
    sum(known),
    min = bins$lower[bin_row[known]],
    max = bins$upper[bin_row[known]]
  )
  drawn[!is.na(labels) & !known] <- 0
  drawn
}

# Column 1 is Compensation.
kaggle <- kaggle %>%
  mutate(across(1, draw_compensation))

I used random draws within each bin to make sure variability is not reduced. If we had used each bin's median or mean instead, many observations would share exactly the same value, which would decrease variability a lot. Variability is needed for the OLS assumptions.

5.2 Imputate null values

set.seed(458)

# Imputation model: 3 imputed datasets, 5 iterations each, using
# classification and regression trees ("cart") as the imputation method.
# The iteration log shows Compensation as the only variable being imputed.
# `method` is spelled out rather than relying on partial matching of `meth`,
# and the `target` argument was dropped: mice() has no such parameter, so it
# was only being passed through `...` without effect.
impute_model <- mice(kaggle, m = 3, maxit = 5, method = "cart")
## 
##  iter imp variable
##   1   1  Compensation
##   1   2  Compensation
##   1   3  Compensation
##   2   1  Compensation
##   2   2  Compensation
##   2   3  Compensation
##   3   1  Compensation
##   3   2  Compensation
##   3   3  Compensation
##   4   1  Compensation
##   4   2  Compensation
##   4   3  Compensation
##   5   1  Compensation
##   5   2  Compensation
##   5   3  Compensation
## Warning: Number of logged events: 15
# Extract the first imputed dataset (action = 1 is the default) and use it as
# the working data from here on.
kaggle <- mice::complete(impute_model, action = 1)

This mice method uses CART (classification and regression trees), which builds a tree-based model for each variable being imputed so that the imputed values are accurate. I did not want to use k-means, as it would give less variability.

6 Export CSV out and save

# Save the cleaned, imputed data. write.csv() keeps its default of writing the
# row names as an extra first column.
write.csv(x = kaggle, file = "kaggleContinuous(1).csv")

I already have a kaggleContinuous.csv. Instead of overwriting that file, I added a (1) to the name so the new data is saved separately.

7 Create our new plots

7.1 Distribution

# Percentage bar chart for one column of `kaggle`.
#
# variable: name of a column of `kaggle`, given as a single string.
# Returns a ggplot object with one bar per distinct value, its height being
# that value's share of all rows, and the share printed inside the bar.
create_distribution_chart <- function(variable) {
  # One row per distinct value: how often it occurs and its share in percent.
  share_by_value <- kaggle %>%
    group_by(across(all_of(variable))) %>%
    summarise(count = n()) %>%
    mutate(percentage = count / sum(count) * 100)

  bar_labels <- geom_text(
    aes(label = sprintf("%.1f%%", percentage)),
    position = position_stack(vjust = 0.5),
    size = 3
  )

  ggplot(share_by_value, aes(x = .data[[variable]], y = percentage)) +
    geom_bar(stat = "identity", fill = "#1d9da5", color = "#449999", alpha = 0.7) +
    bar_labels +
    labs(
      title = paste("Distribution Chart -", variable),
      x = variable,
      y = "Percentage"
    ) +
    # The values are already on a 0-100 scale, hence scale = 1.
    scale_y_continuous(labels = scales::percent_format(scale = 1))
}

# Every column gets its own chart.
variable_names <- names(kaggle)

# Build the list of charts, one ggplot object per column.
charts <- purrr::map(variable_names, create_distribution_chart)

# Printing the list renders each chart in turn (ggsave could write them to files instead).
print(charts)
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

## 
## [[19]]

## 
## [[20]]

## 
## [[21]]

## 
## [[22]]

## 
## [[23]]

## 
## [[24]]

## 
## [[25]]

## 
## [[26]]

## 
## [[27]]

## 
## [[28]]

## 
## [[29]]

## 
## [[30]]

## 
## [[31]]

## 
## [[32]]

## 
## [[33]]

## 
## [[34]]

## 
## [[35]]

## 
## [[36]]

## 
## [[37]]

## 
## [[38]]

## 
## [[39]]

## 
## [[40]]

## 
## [[41]]

## 
## [[42]]

## 
## [[43]]

## 
## [[44]]

## 
## [[45]]

## 
## [[46]]

## 
## [[47]]

## 
## [[48]]

## 
## [[49]]

## 
## [[50]]

## 
## [[51]]

## 
## [[52]]

## 
## [[53]]

## 
## [[54]]

## 
## [[55]]

## 
## [[56]]

## 
## [[57]]

Notice our new distributions. So much prettier.

7.2 Correlation

# Correlation heat map. Only numeric columns can be correlated, so the factor
# columns are left out first.
kaggle1 <- dplyr::select(kaggle, where(is.numeric))

# Pairwise correlation matrix of the numeric columns.
cor <- cor(kaggle1)

# Colour tiles running from dark gray through white to teal.
corrplot(
  cor,
  method = "color",
  col = colorRampPalette(c("gray27", "white", "#449999"))(100),
  cl.lim = c(0, 1),
  tl.col = "#1d9da5"
)

8 Data modeling

8.1 initial model

# Baseline model: ordinary least squares of Compensation on every other column.
lm <- lm(formula = Compensation ~ ., data = kaggle)

# Coefficients and overall fit.
summary(lm)

# Variance inflation factors, to look for multicollinearity among predictors.
vif(lm)

An R^2 of .23 is not too bad, but a lot is definitely still unexplained. Based on the VIF, there is a lot of variance inflation among our variables. I will be getting rid of Published.Academic.Research.Papers, How.many.individuals.are.responsible, Company.Size, Years.Used.Machine.Learning, Similar.Title, and Industry.of.Work. This removes 6 variables.

# Drop the six columns singled out by the VIF check, then refit the same
# all-predictors model on what remains.
kaggle <- kaggle %>%
  dplyr::select(
    -all_of(c(
      "Published.Academic.Research.Papers",
      "How.many.individuals.are.responsible",
      "Company.Size",
      "Years.Used.Machine.Learning",
      "Similar.Title",
      "Industry.of.Work"
    ))
  )
lm <- lm(formula = Compensation ~ ., data = kaggle)

summary(lm)

# Re-check the variance inflation factors after the removal.
vif(lm)

R^2 dropped by 1 point, which is not a large change. The VIFs look much better, but I'm keeping Years.Programming in (VIF of 24, our highest currently). I also completely forgot to check for outliers. Compensation has a sporadic few values from 500,000 up to the maximum. Let's get rid of those.

8.1.1 outlier removal

# Filter out observations with compensation of 500,000 and above.
# The comparison is strict so that a value of exactly 500,000 (the lower edge
# of the "$500,000-999,999" bin) is removed as well, matching the stated rule.
kaggle <- kaggle %>%
  dplyr::filter(Compensation < 500000)

# Refit on the trimmed data to see how the fit changes.
summary(lm(Compensation ~ ., data = kaggle))

R^2 increased to .46. A huge upgrade. Nice.

8.2 data partition

set.seed(458)

# 60% of the rows go to training; the remaining 40% are held out.
train_indices <- createDataPartition(kaggle$Compensation, p = 0.6, list = FALSE)
trainData <- kaggle[train_indices, ]
tempData <- kaggle[-train_indices, ]

# Half of the held-out rows become the validation set.
validation_indices <- createDataPartition(tempData$Compensation, p = 0.5, list = FALSE, times = 1)
validationData <- tempData[validation_indices, ]

# The test set is every held-out row that was NOT picked for validation.
# Previously testData was built from validation_indices, which made it an
# exact copy of validationData, and test_indices was computed but never used.
test_indices <- setdiff(seq_len(nrow(tempData)), validation_indices)
testData <- tempData[test_indices, ]

8.3 model creation

# Fit the all-predictors linear model on the training rows.
lmTrain <- lm(Compensation ~ ., data = trainData)

summary(lmTrain)

# Coefficient table rendered from the model fitted above (no second fit).
lmTrain %>%
  tidy() %>%
  kable()
term estimate std.error statistic p.value
(Intercept) 11609.815645 7836.2876 1.4815454 0.1384835
Age22-24 -5976.072423 1170.5283 -5.1054491 0.0000003
Age25-29 -2848.869176 1288.1800 -2.2115460 0.0270139
Age30-34 29.303920 1484.7600 0.0197365 0.9842539
Age35-39 -353.206940 1612.8983 -0.2189890 0.8266618
Age40-44 -408.153980 1724.1126 -0.2367328 0.8128675
Age45-49 -291.225984 1969.4702 -0.1478702 0.8824473
Age50-54 5639.952521 2231.8713 2.5270062 0.0115147
Age55-59 -276.017117 2555.6977 -0.1080007 0.9139967
Age60-69 3248.673435 2763.4894 1.1755693 0.2397867
Age70+ -9823.527188 4786.4835 -2.0523475 0.0401542
GenderNonbinary 7490.414653 5929.5745 1.2632297 0.2065274
GenderPrefer not to say -2040.746991 2861.7693 -0.7131067 0.4757914
GenderPrefer to self-describe -8192.943691 11548.6393 -0.7094294 0.4780697
GenderWoman -2882.686317 835.0523 -3.4521029 0.0005579
CountryArgentina 4794.410129 8313.8976 0.5766742 0.5641687
CountryAustralia 90125.117685 8788.2225 10.2552157 0.0000000
CountryBangladesh 930.738692 8177.0621 0.1138231 0.9093796
CountryBelgium 9394.209437 10188.4652 0.9220436 0.3565215
CountryBrazil 2108.908490 7653.2223 0.2755582 0.7828914
CountryCameroon 8086.397222 9738.9107 0.8303184 0.4063727
CountryCanada 48182.807259 8150.5963 5.9115684 0.0000000
CountryChile 5592.973705 8732.3594 0.6404883 0.5218656
CountryChina 6688.929161 7801.4265 0.8573982 0.3912393
CountryColombia 3171.920314 8094.6832 0.3918523 0.6951732
CountryCzech Republic 8366.889295 10267.3507 0.8149025 0.4151418
CountryEcuador -381.341373 10504.2498 -0.0363035 0.9710408
CountryEgypt 3443.889106 7947.4143 0.4333345 0.6647783
CountryEthiopia 4214.205596 9005.6529 0.4679511 0.6398267
CountryFrance 15373.827628 8134.4875 1.8899565 0.0587841
CountryGermany 20152.101226 9244.7998 2.1798310 0.0292864
CountryGhana 6935.779915 8914.2843 0.7780524 0.4365511
CountryHong Kong (S.A.R.) 41653.867854 10105.5255 4.1218903 0.0000378
CountryI do not wish to disclose my location 1810.893775 10726.0297 0.1688317 0.8659314
CountryIndia 3529.329151 7454.5444 0.4734467 0.6359018
CountryIndonesia 3749.314956 7892.1598 0.4750683 0.6347456
CountryIran, Islamic Republic of… -3195.371245 8913.8923 -0.3584709 0.7199962
CountryIreland 15762.926315 10108.0893 1.5594368 0.1189153
CountryIsrael 78581.138352 9242.9175 8.5017678 0.0000000
CountryItaly 3172.978133 8321.9617 0.3812777 0.7030029
CountryJapan 16636.166767 7787.1093 2.1363726 0.0326661
CountryKenya 2828.478527 8261.9300 0.3423508 0.7320920
CountryMalaysia 6803.101987 9401.3847 0.7236277 0.4693063
CountryMexico 3372.660760 7902.5291 0.4267825 0.6695443
CountryMorocco 8838.103330 8371.2229 1.0557721 0.2910903
CountryNepal 3723.224295 9328.4447 0.3991259 0.6898064
CountryNetherlands 16887.991175 8959.1731 1.8849944 0.0594510
CountryNigeria 2159.304887 7685.4796 0.2809590 0.7787459
CountryOther 8533.257298 7556.1862 1.1293074 0.2587872
CountryPakistan 1962.592147 7699.1557 0.2549100 0.7987963
CountryPeru -6041.812885 8912.7206 -0.6778865 0.4978547
CountryPhilippines 2505.910945 9098.1313 0.2754314 0.7829888
CountryPoland 4794.169940 8806.8272 0.5443697 0.5861956
CountryPortugal -6737.836549 9413.5034 -0.7157629 0.4741495
CountryRomania -8293.129624 10804.0269 -0.7675962 0.4427399
CountryRussia 5328.528160 7970.2704 0.6685505 0.5037931
CountrySaudi Arabia 23078.299365 9178.6877 2.5143354 0.0119367
CountrySingapore 25752.739961 9812.5392 2.6244726 0.0086876
CountrySouth Africa 6407.598962 9185.3191 0.6975913 0.4854442
CountrySouth Korea 11465.095905 7970.0884 1.4385155 0.1503099
CountrySpain 9180.253781 8073.0033 1.1371547 0.2554928
CountrySri Lanka -10566.351955 9403.3159 -1.1236836 0.2611663
CountryTaiwan 6174.908675 8190.3561 0.7539243 0.4509072
CountryThailand 6239.169456 8672.7639 0.7193981 0.4719075
CountryTunisia 10074.488964 8712.5225 1.1563229 0.2475686
CountryTurkey 2960.712011 7934.9800 0.3731215 0.7090636
CountryUkraine 1003.228565 9518.3962 0.1053989 0.9160608
CountryUnited Arab Emirates 19958.040702 9099.9444 2.1932047 0.0283088
CountryUnited Kingdom of Great Britain and Northern Ireland 34086.423472 8047.4428 4.2356839 0.0000229
CountryUnited States of America 91355.731663 7515.4444 12.1557325 0.0000000
CountryViet Nam -170.696465 8273.8481 -0.0206308 0.9835404
CountryZimbabwe -1677.418088 11282.6020 -0.1486730 0.8818138
Student 1006.985503 1172.6925 0.8586953 0.3905232
Years.Programming< 1 years 1666.976861 4284.8915 0.3890360 0.6972554
Years.Programming1-3 years 3189.779807 4294.8075 0.7427061 0.4576719
Years.Programming10-20 years 28979.885268 4477.3392 6.4725687 0.0000000
Years.Programming20+ years 42121.244728 4592.0178 9.1727094 0.0000000
Years.Programming3-5 years 6312.259366 4348.8892 1.4514648 0.1466726
Years.Programming5-10 years 12695.918584 4399.3700 2.8858492 0.0039095
Years.ProgrammingI have never written code 2127.701987 4264.5792 0.4989243 0.6178405
Incorporate.Machine.LearningI do not know -8192.178478 1703.6691 -4.8085503 0.0000015
Incorporate.Machine.LearningNo (we do not use ML methods) -6378.255853 1525.2370 -4.1818129 0.0000291
Incorporate.Machine.LearningWe are exploring ML methods (and may one day put a model into production) -458.942288 1689.4868 -0.2716460 0.7858982
Incorporate.Machine.LearningWe have well established ML methods (i.e., models in production for more than 2 years) 26170.382216 1661.7037 15.7491267 0.0000000
Incorporate.Machine.LearningWe recently started using ML methods (i.e., models in production for less than 2 years) 11751.409124 1827.2742 6.4311142 0.0000000
Incorporate.Machine.LearningWe use ML methods for generating insights (but do not put working models into production) 2854.386769 1988.7993 1.4352312 0.1512432
ML.Hubs…Repositories.Used TensorFlow Hub -559.001995 1886.8538 -0.2962614 0.7670348
ML.Hubs…Repositories.Used Huggingface Models 6444.232908 2400.9173 2.6840712 0.0072816
ML.Hubs…Repositories.Used Jumpstart 186.448625 10323.9098 0.0180599 0.9855913
ML.Hubs…Repositories.Used Kaggle datasets -148.637713 1410.2338 -0.1053993 0.9160604
ML.Hubs…Repositories.Used NVIDIA NGC models 1020.039924 6199.3380 0.1645401 0.8693083
ML.Hubs…Repositories.Used ONNX models 2125.877385 8365.1746 0.2541343 0.7993955
ML.Hubs…Repositories.Used PyTorch Hub 2232.127475 2673.4229 0.8349324 0.4037698
ML.Hubs…Repositories.Used Timm 2424.112325 4644.6992 0.5219094 0.6017415
ML.Hubs…Repositories.UsedOther storage services (i.e. google drive) -1685.574295 5189.2098 -0.3248229 0.7453199
Highest.Level.of.Formal.EducationBachelor’s degree -3643.970189 4605.9387 -0.7911460 0.4288720
Highest.Level.of.Formal.EducationDoctoral degree -1309.527980 4670.0898 -0.2804075 0.7791690
Highest.Level.of.Formal.EducationI prefer not to answer -4695.194450 4741.5888 -0.9902154 0.3220857
Highest.Level.of.Formal.EducationMaster’s degree -1270.036986 4579.8685 -0.2773086 0.7815472
Highest.Level.of.Formal.EducationNo formal education past high school 504.415958 5055.6180 0.0997734 0.9205257
Highest.Level.of.Formal.EducationProfessional doctorate -3307.094253 4971.1934 -0.6652516 0.5059004
Highest.Level.of.Formal.EducationSome college/university study without earning a bachelor’s degree -2885.111336 4731.1667 -0.6098097 0.5419976
Helpful.University 448.256012 874.0432 0.5128534 0.6080619
Helpful.Online.Courses -1550.934856 784.7419 -1.9763630 0.0481330
Helpful.Social.Media 1097.837710 1020.1660 1.0761363 0.2818846
Helpful.Video.Platform -1173.516744 737.8836 -1.5903820 0.1117710
Helpful.Kaggle -2155.451199 776.4901 -2.7758901 0.0055123
Helpful.None -1527.072611 1961.0898 -0.7786857 0.4361778
Media.on.Social.Twitter 1132.725430 994.7634 1.1386883 0.2548524
Media.on.Social.Email.Newsletters 922.138724 982.6021 0.9384661 0.3480209
Media.on.Reddit 1106.268754 1136.5545 0.9733530 0.3303944
Media.on.Kaggle 601.175857 801.2258 0.7503201 0.4530744
Media.on.Course.Forums -2896.466420 968.5463 -2.9905296 0.0027897
Media.on.Youtube -1440.760950 782.8805 -1.8403332 0.0657402
Media.on.Podcasts 2209.887974 1243.4238 1.7772604 0.0755468
Media.on.Blogs 1544.822954 813.3659 1.8992965 0.0575457
Media.on.Journal.Publications 972.504372 1013.4822 0.9595673 0.3372893
Media.on.Slack.Communities -946.337671 1367.0703 -0.6922377 0.4887994
No.Media.Sources -3801.858218 1604.7920 -2.3690661 0.0178464
Data.Science.on.Coursera 1471.069736 807.0474 1.8227797 0.0683578
Data.Science.on.edX 1904.456670 1188.2840 1.6026949 0.1090243
Data.Science.on.Kaggle.Learn.Courses -1185.328712 835.8581 -1.4180981 0.1561841
Data.Science.on.DataCamp 477.655048 1003.3527 0.4760590 0.6340397
Data.Science.on.Fast.ai 5999.877664 1847.8215 3.2470006 0.0011690
Data.Science.on.Udacity 762.043662 1275.4122 0.5974881 0.5501911
Data.Science.on.Udemy -773.137435 840.2486 -0.9201294 0.3575208
Data.Science.on.LinkedIn.Learning -181.414377 1110.3069 -0.1633912 0.8702128
Cloud.certification.programs 4.381781 1337.4218 0.0032763 0.9973859
Data.Science.University.Courses 4541.194118 894.6426 5.0759870 0.0000004
No.Data.Science.Courses 228.616270 1279.2315 0.1787138 0.8581650
Python 978.681861 1197.3490 0.8173739 0.4137285
R -723.881913 916.1568 -0.7901288 0.4294657
SQL 890.166373 775.2301 1.1482609 0.2508802
C -543.511535 1067.7065 -0.5090458 0.6107280
C. -262.027092 1494.8646 -0.1752848 0.8608583
C.. -866.829316 994.6779 -0.8714674 0.3835137
Java -1870.630712 1005.8559 -1.8597402 0.0629430
Javascript -2497.689869 1080.9607 -2.3106205 0.0208681
Bash 3499.993194 1422.3391 2.4607304 0.0138773
PHP -2167.406105 1586.7687 -1.3659244 0.1719843
MATLAB -2417.058235 1169.6502 -2.0664796 0.0388013
Julia -3628.959547 3052.7389 -1.1887553 0.2345559
Go 8078.002392 2938.8765 2.7486702 0.0059913
No.Programming.Languages -3198.954783 3427.0949 -0.9334305 0.3506136
# Fit the all-predictors linear model on the validation rows.
# NOTE(review): this estimates a new model on validationData rather than
# scoring lmTrain on it -- confirm that is the intended comparison.
lmVal <- lm(Compensation ~ ., data = validationData)

summary(lmVal)

# Coefficient table rendered from the model fitted above (no second fit).
lmVal %>%
  tidy() %>%
  kable()
term estimate std.error statistic p.value
(Intercept) 8897.9640 9948.477 0.8944046 0.3711519
Age22-24 -6641.7633 2071.417 -3.2063866 0.0013533
Age25-29 -4096.2519 2260.085 -1.8124325 0.0699840
Age30-34 -3537.2873 2651.804 -1.3339175 0.1822964
Age35-39 1898.7355 2907.310 0.6530900 0.5137306
Age40-44 1161.3335 3053.306 0.3803527 0.7037010
Age45-49 1692.6353 3488.156 0.4852522 0.6275204
Age50-54 10773.9332 3888.750 2.7705393 0.0056186
Age55-59 4081.6783 4476.425 0.9118165 0.3619127
Age60-69 -1471.1032 4907.645 -0.2997574 0.7643756
Age70+ -40897.3302 10008.430 -4.0862883 0.0000446
GenderNonbinary -5557.7545 11477.723 -0.4842210 0.6282519
GenderPrefer not to say 680.6658 5013.120 0.1357769 0.8920036
GenderPrefer to self-describe 5020.2854 15576.711 0.3222943 0.7472443
GenderWoman -3938.2537 1475.383 -2.6693101 0.0076272
CountryArgentina 774.9395 11041.271 0.0701857 0.9440489
CountryAustralia 82905.3352 11681.904 7.0969025 0.0000000
CountryBangladesh 9487.2623 10524.251 0.9014667 0.3673870
CountryBelgium 8475.7239 15323.063 0.5531351 0.5801976
CountryBrazil 5382.2411 9688.006 0.5555572 0.5785404
CountryCameroon 16453.7945 13638.698 1.2064051 0.2277229
CountryCanada 50711.7904 10701.047 4.7389559 0.0000022
CountryChile 4120.2278 12338.383 0.3339358 0.7384431
CountryChina 6253.3730 10126.417 0.6175307 0.5369151
CountryColombia 10785.0263 10935.932 0.9862009 0.3240860
CountryCzech Republic 8461.5124 17884.744 0.4731134 0.6361545
CountryEcuador -5848.6259 15737.762 -0.3716301 0.7101853
CountryEgypt 3332.1349 10091.961 0.3301772 0.7412810
CountryEthiopia 9055.8706 13429.313 0.6743361 0.5001313
CountryFrance 22578.1563 10612.860 2.1274338 0.0334367
CountryGermany 14268.9710 12502.019 1.1413333 0.2537902
CountryGhana 2344.5582 12215.172 0.1919382 0.8477990
CountryHong Kong (S.A.R.) 13041.5639 14186.021 0.9193250 0.3579734
CountryI do not wish to disclose my location 10732.3165 17004.586 0.6311425 0.5279785
CountryIndia 6586.2932 9172.129 0.7180768 0.4727461
CountryIndonesia 10870.0604 10289.728 1.0563992 0.2908409
CountryIran, Islamic Republic of… 15155.0025 12423.579 1.2198580 0.2225807
CountryIreland 71585.1765 15306.774 4.6766992 0.0000030
CountryIsrael 84411.9283 13467.861 6.2676564 0.0000000
CountryItaly 15009.6698 11690.808 1.2838864 0.1992458
CountryJapan 22445.9919 9912.273 2.2644647 0.0235915
CountryKenya 4312.6214 11135.139 0.3872984 0.6985531
CountryMalaysia 341.4790 15762.887 0.0216635 0.9827173
CountryMexico 8518.8282 10240.282 0.8318939 0.4055117
CountryMorocco 10769.8410 11493.216 0.9370607 0.3487761
CountryNepal 12159.2154 14825.492 0.8201559 0.4121694
CountryNetherlands 32911.9203 12615.158 2.6089185 0.0091120
CountryNigeria 11215.1186 9787.403 1.1458728 0.2519069
CountryOther 8877.0562 9410.733 0.9432906 0.3455814
CountryPakistan 4144.5560 9878.063 0.4195717 0.6748178
CountryPeru 8525.1296 11976.434 0.7118254 0.4766087
CountryPhilippines 2886.8810 12104.594 0.2384947 0.8115080
CountryPoland 20944.6120 12369.502 1.6932461 0.0904758
CountryPortugal 6404.8958 15838.306 0.4043927 0.6859426
CountryRomania 5931.5476 14541.765 0.4078974 0.6833679
CountryRussia 12386.0034 10259.910 1.2072234 0.2274077
CountrySaudi Arabia 20884.2801 14184.278 1.4723541 0.1409931
CountrySingapore 33555.9692 12901.838 2.6008673 0.0093284
CountrySouth Africa 7916.3527 11419.120 0.6932542 0.4881848
CountrySouth Korea 9116.4124 10497.266 0.8684559 0.3851897
CountrySpain 23098.4818 11100.611 2.0808298 0.0375042
CountrySri Lanka -12694.9592 16294.252 -0.7791066 0.4359567
CountryTaiwan 10427.4800 10693.585 0.9751154 0.3295539
CountryThailand 1394.5859 12229.749 0.1140323 0.9092172
CountryTunisia 7549.6580 13015.713 0.5800418 0.5619146
CountryTurkey 1616.1771 10331.055 0.1564387 0.8756940
CountryUkraine 1237.6520 13679.324 0.0904761 0.9279128
CountryUnited Arab Emirates 23132.3573 12760.481 1.8128123 0.0699254
CountryUnited Kingdom of Great Britain and Northern Ireland 51562.5763 11518.027 4.4766849 0.0000078
CountryUnited States of America 97751.2547 9303.987 10.5063830 0.0000000
CountryViet Nam -352.9408 11003.137 -0.0320764 0.9744125
CountryZimbabwe 511.2355 13919.532 0.0367279 0.9707035
Student 750.5338 2084.419 0.3600686 0.7188122
Years.Programming< 1 years -1719.6036 8647.572 -0.1988539 0.8423857
Years.Programming1-3 years 722.9183 8652.776 0.0835476 0.9334198
Years.Programming10-20 years 27996.9691 8893.703 3.1479541 0.0016546
Years.Programming20+ years 33746.7610 9112.792 3.7032298 0.0002153
Years.Programming3-5 years 4905.9652 8744.225 0.5610520 0.5747892
Years.Programming5-10 years 8447.6049 8785.765 0.9615105 0.3363457
Years.ProgrammingI have never written code -248.1520 8604.153 -0.0288410 0.9769927
Incorporate.Machine.LearningI do not know -3960.7968 2959.601 -1.3382872 0.1808684
Incorporate.Machine.LearningNo (we do not use ML methods) -5077.9425 2691.129 -1.8869188 0.0592336
Incorporate.Machine.LearningWe are exploring ML methods (and may one day put a model into production) 2358.5856 2851.009 0.8272811 0.4081203
Incorporate.Machine.LearningWe have well established ML methods (i.e., models in production for more than 2 years) 29841.0260 2994.593 9.9649682 0.0000000
Incorporate.Machine.LearningWe recently started using ML methods (i.e., models in production for less than 2 years) 6655.1265 3197.036 2.0816552 0.0374287
Incorporate.Machine.LearningWe use ML methods for generating insights (but do not put working models into production) 3676.2384 3521.212 1.0440264 0.2965276
ML.Hubs…Repositories.Used TensorFlow Hub 5550.5443 3420.097 1.6229199 0.1046745
ML.Hubs…Repositories.Used Huggingface Models 11069.2004 4461.641 2.4809709 0.0131377
ML.Hubs…Repositories.Used Jumpstart -14603.3567 28787.769 -0.5072764 0.6119850
ML.Hubs…Repositories.Used Kaggle datasets -949.7629 2512.312 -0.3780434 0.7054156
ML.Hubs…Repositories.Used NVIDIA NGC models -1940.7303 9985.857 -0.1943479 0.8459120
ML.Hubs…Repositories.Used ONNX models 6766.3049 12398.032 0.5457564 0.5852597
ML.Hubs…Repositories.Used PyTorch Hub -3242.6296 4623.816 -0.7012887 0.4831581
ML.Hubs…Repositories.Used Timm 11077.5492 8159.729 1.3575879 0.1746605
ML.Hubs…Repositories.UsedOther storage services (i.e. google drive) 2981.9750 9036.644 0.3299870 0.7414247
Highest.Level.of.Formal.EducationBachelor’s degree -704.0327 9083.008 -0.0775110 0.9382204
Highest.Level.of.Formal.EducationDoctoral degree -1337.4739 9255.672 -0.1445032 0.8851094
Highest.Level.of.Formal.EducationI prefer not to answer 1896.7834 9359.414 0.2026605 0.8394093
Highest.Level.of.Formal.EducationMaster’s degree -2507.5674 9042.283 -0.2773158 0.7815500
Highest.Level.of.Formal.EducationNo formal education past high school 1095.0893 9723.973 0.1126175 0.9103387
Highest.Level.of.Formal.EducationProfessional doctorate 1815.7385 9750.094 0.1862278 0.8522743
Highest.Level.of.Formal.EducationSome college/university study without earning a bachelor’s degree -7060.9140 9367.752 -0.7537469 0.4510395
Helpful.University -981.1163 1526.991 -0.6425161 0.5205699
Helpful.Online.Courses 177.7580 1393.003 0.1276077 0.8984650
Helpful.Social.Media 1782.5842 1849.385 0.9638796 0.3351565
Helpful.Video.Platform -1476.7613 1305.861 -1.1308714 0.2581677
Helpful.Kaggle -1928.5795 1383.019 -1.3944711 0.1632422
Helpful.None 786.6436 3426.703 0.2295628 0.8184416
Media.on.Social.Twitter 3385.9781 1752.493 1.9320927 0.0534089
Media.on.Social.Email.Newsletters 232.0917 1737.398 0.1335858 0.8937359
Media.on.Reddit 196.0316 2042.547 0.0959741 0.9235453
Media.on.Kaggle 1302.6609 1438.275 0.9057109 0.3651360
Media.on.Course.Forums -2217.9694 1723.620 -1.2868086 0.1982252
Media.on.Youtube -2567.4087 1381.660 -1.8582064 0.0632030
Media.on.Podcasts 2010.0364 2227.739 0.9022763 0.3669570
Media.on.Blogs 333.7197 1431.588 0.2331116 0.8156850
Media.on.Journal.Publications -399.4962 1823.090 -0.2191313 0.8265574
Media.on.Slack.Communities -1167.1066 2352.511 -0.4961110 0.6198396
No.Media.Sources -3509.0622 2905.592 -1.2076926 0.2272271
Data.Science.on.Coursera -456.0458 1435.822 -0.3176200 0.7507875
Data.Science.on.edX 768.2341 2131.088 0.3604892 0.7184977
Data.Science.on.Kaggle.Learn.Courses -1021.5180 1482.574 -0.6890166 0.4908473
Data.Science.on.DataCamp -1243.9259 1812.741 -0.6862126 0.4926134
Data.Science.on.Fast.ai 923.0810 3275.566 0.2818081 0.7781033
Data.Science.on.Udacity 1051.6840 2231.709 0.4712461 0.6374872
Data.Science.on.Udemy -882.2995 1457.959 -0.6051607 0.5451019
Data.Science.on.LinkedIn.Learning -113.6682 1993.927 -0.0570072 0.9545419
Cloud.certification.programs 1597.7881 2311.804 0.6911435 0.4895099
Data.Science.University.Courses 3510.0109 1588.833 2.2091751 0.0272112
No.Data.Science.Courses 123.2877 2204.012 0.0559379 0.9553937
Python 1857.2647 2117.445 0.8771254 0.3804639
R 706.2501 1637.289 0.4313535 0.6662314
SQL 989.1151 1371.301 0.7212968 0.4707633
C 1512.9581 1874.749 0.8070190 0.4196969
C. -1880.7577 2539.441 -0.7406188 0.4589621
C.. 2090.7774 1779.804 1.1747233 0.2401657
Java -1298.6844 1761.031 -0.7374567 0.4608820
Javascript -1436.8006 1886.210 -0.7617393 0.4462543
Bash 2284.5738 2515.254 0.9082874 0.3637736
PHP -1125.5821 2720.452 -0.4137482 0.6790777
MATLAB 1830.8836 2055.728 0.8906254 0.3731764
Julia 1779.4135 6003.624 0.2963899 0.7669456
Go 6962.7998 5258.782 1.3240329 0.1855574
No.Programming.Languages -3075.5039 6153.472 -0.4997997 0.6172398
# Fit a linear model of Compensation on every other column of testData.
lmTest <- lm(Compensation ~ ., data = testData)

# Print the full model summary for the fitted object.
summary(lmTest)

# Render the coefficient table (term, estimate, std.error, statistic, p.value).
# Reuse the already-fitted model rather than refitting the identical formula
# on the same data a second time; the resulting table is the same.
lmTest %>%
  tidy() %>%
  kable()
term estimate std.error statistic p.value
(Intercept) 8897.9640 9948.477 0.8944046 0.3711519
Age22-24 -6641.7633 2071.417 -3.2063866 0.0013533
Age25-29 -4096.2519 2260.085 -1.8124325 0.0699840
Age30-34 -3537.2873 2651.804 -1.3339175 0.1822964
Age35-39 1898.7355 2907.310 0.6530900 0.5137306
Age40-44 1161.3335 3053.306 0.3803527 0.7037010
Age45-49 1692.6353 3488.156 0.4852522 0.6275204
Age50-54 10773.9332 3888.750 2.7705393 0.0056186
Age55-59 4081.6783 4476.425 0.9118165 0.3619127
Age60-69 -1471.1032 4907.645 -0.2997574 0.7643756
Age70+ -40897.3302 10008.430 -4.0862883 0.0000446
GenderNonbinary -5557.7545 11477.723 -0.4842210 0.6282519
GenderPrefer not to say 680.6658 5013.120 0.1357769 0.8920036
GenderPrefer to self-describe 5020.2854 15576.711 0.3222943 0.7472443
GenderWoman -3938.2537 1475.383 -2.6693101 0.0076272
CountryArgentina 774.9395 11041.271 0.0701857 0.9440489
CountryAustralia 82905.3352 11681.904 7.0969025 0.0000000
CountryBangladesh 9487.2623 10524.251 0.9014667 0.3673870
CountryBelgium 8475.7239 15323.063 0.5531351 0.5801976
CountryBrazil 5382.2411 9688.006 0.5555572 0.5785404
CountryCameroon 16453.7945 13638.698 1.2064051 0.2277229
CountryCanada 50711.7904 10701.047 4.7389559 0.0000022
CountryChile 4120.2278 12338.383 0.3339358 0.7384431
CountryChina 6253.3730 10126.417 0.6175307 0.5369151
CountryColombia 10785.0263 10935.932 0.9862009 0.3240860
CountryCzech Republic 8461.5124 17884.744 0.4731134 0.6361545
CountryEcuador -5848.6259 15737.762 -0.3716301 0.7101853
CountryEgypt 3332.1349 10091.961 0.3301772 0.7412810
CountryEthiopia 9055.8706 13429.313 0.6743361 0.5001313
CountryFrance 22578.1563 10612.860 2.1274338 0.0334367
CountryGermany 14268.9710 12502.019 1.1413333 0.2537902
CountryGhana 2344.5582 12215.172 0.1919382 0.8477990
CountryHong Kong (S.A.R.) 13041.5639 14186.021 0.9193250 0.3579734
CountryI do not wish to disclose my location 10732.3165 17004.586 0.6311425 0.5279785
CountryIndia 6586.2932 9172.129 0.7180768 0.4727461
CountryIndonesia 10870.0604 10289.728 1.0563992 0.2908409
CountryIran, Islamic Republic of… 15155.0025 12423.579 1.2198580 0.2225807
CountryIreland 71585.1765 15306.774 4.6766992 0.0000030
CountryIsrael 84411.9283 13467.861 6.2676564 0.0000000
CountryItaly 15009.6698 11690.808 1.2838864 0.1992458
CountryJapan 22445.9919 9912.273 2.2644647 0.0235915
CountryKenya 4312.6214 11135.139 0.3872984 0.6985531
CountryMalaysia 341.4790 15762.887 0.0216635 0.9827173
CountryMexico 8518.8282 10240.282 0.8318939 0.4055117
CountryMorocco 10769.8410 11493.216 0.9370607 0.3487761
CountryNepal 12159.2154 14825.492 0.8201559 0.4121694
CountryNetherlands 32911.9203 12615.158 2.6089185 0.0091120
CountryNigeria 11215.1186 9787.403 1.1458728 0.2519069
CountryOther 8877.0562 9410.733 0.9432906 0.3455814
CountryPakistan 4144.5560 9878.063 0.4195717 0.6748178
CountryPeru 8525.1296 11976.434 0.7118254 0.4766087
CountryPhilippines 2886.8810 12104.594 0.2384947 0.8115080
CountryPoland 20944.6120 12369.502 1.6932461 0.0904758
CountryPortugal 6404.8958 15838.306 0.4043927 0.6859426
CountryRomania 5931.5476 14541.765 0.4078974 0.6833679
CountryRussia 12386.0034 10259.910 1.2072234 0.2274077
CountrySaudi Arabia 20884.2801 14184.278 1.4723541 0.1409931
CountrySingapore 33555.9692 12901.838 2.6008673 0.0093284
CountrySouth Africa 7916.3527 11419.120 0.6932542 0.4881848
CountrySouth Korea 9116.4124 10497.266 0.8684559 0.3851897
CountrySpain 23098.4818 11100.611 2.0808298 0.0375042
CountrySri Lanka -12694.9592 16294.252 -0.7791066 0.4359567
CountryTaiwan 10427.4800 10693.585 0.9751154 0.3295539
CountryThailand 1394.5859 12229.749 0.1140323 0.9092172
CountryTunisia 7549.6580 13015.713 0.5800418 0.5619146
CountryTurkey 1616.1771 10331.055 0.1564387 0.8756940
CountryUkraine 1237.6520 13679.324 0.0904761 0.9279128
CountryUnited Arab Emirates 23132.3573 12760.481 1.8128123 0.0699254
CountryUnited Kingdom of Great Britain and Northern Ireland 51562.5763 11518.027 4.4766849 0.0000078
CountryUnited States of America 97751.2547 9303.987 10.5063830 0.0000000
CountryViet Nam -352.9408 11003.137 -0.0320764 0.9744125
CountryZimbabwe 511.2355 13919.532 0.0367279 0.9707035
Student 750.5338 2084.419 0.3600686 0.7188122
Years.Programming< 1 years -1719.6036 8647.572 -0.1988539 0.8423857
Years.Programming1-3 years 722.9183 8652.776 0.0835476 0.9334198
Years.Programming10-20 years 27996.9691 8893.703 3.1479541 0.0016546
Years.Programming20+ years 33746.7610 9112.792 3.7032298 0.0002153
Years.Programming3-5 years 4905.9652 8744.225 0.5610520 0.5747892
Years.Programming5-10 years 8447.6049 8785.765 0.9615105 0.3363457
Years.ProgrammingI have never written code -248.1520 8604.153 -0.0288410 0.9769927
Incorporate.Machine.LearningI do not know -3960.7968 2959.601 -1.3382872 0.1808684
Incorporate.Machine.LearningNo (we do not use ML methods) -5077.9425 2691.129 -1.8869188 0.0592336
Incorporate.Machine.LearningWe are exploring ML methods (and may one day put a model into production) 2358.5856 2851.009 0.8272811 0.4081203
Incorporate.Machine.LearningWe have well established ML methods (i.e., models in production for more than 2 years) 29841.0260 2994.593 9.9649682 0.0000000
Incorporate.Machine.LearningWe recently started using ML methods (i.e., models in production for less than 2 years) 6655.1265 3197.036 2.0816552 0.0374287
Incorporate.Machine.LearningWe use ML methods for generating insights (but do not put working models into production) 3676.2384 3521.212 1.0440264 0.2965276
ML.Hubs…Repositories.Used TensorFlow Hub 5550.5443 3420.097 1.6229199 0.1046745
ML.Hubs…Repositories.Used Huggingface Models 11069.2004 4461.641 2.4809709 0.0131377
ML.Hubs…Repositories.Used Jumpstart -14603.3567 28787.769 -0.5072764 0.6119850
ML.Hubs…Repositories.Used Kaggle datasets -949.7629 2512.312 -0.3780434 0.7054156
ML.Hubs…Repositories.Used NVIDIA NGC models -1940.7303 9985.857 -0.1943479 0.8459120
ML.Hubs…Repositories.Used ONNX models 6766.3049 12398.032 0.5457564 0.5852597
ML.Hubs…Repositories.Used PyTorch Hub -3242.6296 4623.816 -0.7012887 0.4831581
ML.Hubs…Repositories.Used Timm 11077.5492 8159.729 1.3575879 0.1746605
ML.Hubs…Repositories.UsedOther storage services (i.e. google drive) 2981.9750 9036.644 0.3299870 0.7414247
Highest.Level.of.Formal.EducationBachelor’s degree -704.0327 9083.008 -0.0775110 0.9382204
Highest.Level.of.Formal.EducationDoctoral degree -1337.4739 9255.672 -0.1445032 0.8851094
Highest.Level.of.Formal.EducationI prefer not to answer 1896.7834 9359.414 0.2026605 0.8394093
Highest.Level.of.Formal.EducationMaster’s degree -2507.5674 9042.283 -0.2773158 0.7815500
Highest.Level.of.Formal.EducationNo formal education past high school 1095.0893 9723.973 0.1126175 0.9103387
Highest.Level.of.Formal.EducationProfessional doctorate 1815.7385 9750.094 0.1862278 0.8522743
Highest.Level.of.Formal.EducationSome college/university study without earning a bachelor’s degree -7060.9140 9367.752 -0.7537469 0.4510395
Helpful.University -981.1163 1526.991 -0.6425161 0.5205699
Helpful.Online.Courses 177.7580 1393.003 0.1276077 0.8984650
Helpful.Social.Media 1782.5842 1849.385 0.9638796 0.3351565
Helpful.Video.Platform -1476.7613 1305.861 -1.1308714 0.2581677
Helpful.Kaggle -1928.5795 1383.019 -1.3944711 0.1632422
Helpful.None 786.6436 3426.703 0.2295628 0.8184416
Media.on.Social.Twitter 3385.9781 1752.493 1.9320927 0.0534089
Media.on.Social.Email.Newsletters 232.0917 1737.398 0.1335858 0.8937359
Media.on.Reddit 196.0316 2042.547 0.0959741 0.9235453
Media.on.Kaggle 1302.6609 1438.275 0.9057109 0.3651360
Media.on.Course.Forums -2217.9694 1723.620 -1.2868086 0.1982252
Media.on.Youtube -2567.4087 1381.660 -1.8582064 0.0632030
Media.on.Podcasts 2010.0364 2227.739 0.9022763 0.3669570
Media.on.Blogs 333.7197 1431.588 0.2331116 0.8156850
Media.on.Journal.Publications -399.4962 1823.090 -0.2191313 0.8265574
Media.on.Slack.Communities -1167.1066 2352.511 -0.4961110 0.6198396
No.Media.Sources -3509.0622 2905.592 -1.2076926 0.2272271
Data.Science.on.Coursera -456.0458 1435.822 -0.3176200 0.7507875
Data.Science.on.edX 768.2341 2131.088 0.3604892 0.7184977
Data.Science.on.Kaggle.Learn.Courses -1021.5180 1482.574 -0.6890166 0.4908473
Data.Science.on.DataCamp -1243.9259 1812.741 -0.6862126 0.4926134
Data.Science.on.Fast.ai 923.0810 3275.566 0.2818081 0.7781033
Data.Science.on.Udacity 1051.6840 2231.709 0.4712461 0.6374872
Data.Science.on.Udemy -882.2995 1457.959 -0.6051607 0.5451019
Data.Science.on.LinkedIn.Learning -113.6682 1993.927 -0.0570072 0.9545419
Cloud.certification.programs 1597.7881 2311.804 0.6911435 0.4895099
Data.Science.University.Courses 3510.0109 1588.833 2.2091751 0.0272112
No.Data.Science.Courses 123.2877 2204.012 0.0559379 0.9553937
Python 1857.2647 2117.445 0.8771254 0.3804639
R 706.2501 1637.289 0.4313535 0.6662314
SQL 989.1151 1371.301 0.7212968 0.4707633
C 1512.9581 1874.749 0.8070190 0.4196969
C. -1880.7577 2539.441 -0.7406188 0.4589621
C.. 2090.7774 1779.804 1.1747233 0.2401657
Java -1298.6844 1761.031 -0.7374567 0.4608820
Javascript -1436.8006 1886.210 -0.7617393 0.4462543
Bash 2284.5738 2515.254 0.9082874 0.3637736
PHP -1125.5821 2720.452 -0.4137482 0.6790777
MATLAB 1830.8836 2055.728 0.8906254 0.3731764
Julia 1779.4135 6003.624 0.2963899 0.7669456
Go 6962.7998 5258.782 1.3240329 0.1855574
No.Programming.Languages -3075.5039 6153.472 -0.4997997 0.6172398

The R² values are very close to one another, which indicates a good model.

9 Model play-around

# Load the continuous-compensation data and prepare it in one pipeline:
# factor conversion, row filtering, then dropping unwanted predictors.
kaggleCon <- read.csv("kaggleContinuous.csv") %>%
  # convert columns 2-4 and 6-14 (selected by position) to factors
  mutate(across(c(2:4, 6:14), as.factor)) %>%
  # filter out the outliers: keep compensation up to 500000 and drop the
  # "Prefer to self-describe" gender rows
  filter(
    Compensation <= 500000,
    Gender != "Prefer to self-describe"
  ) %>%
  # deselect inflated variables from dataset
  select(
    -c(
      Published.Academic.Research.Papers,
      How.many.individuals.are.responsible,
      Company.Size,
      Years.Used.Machine.Learning,
      Industry.of.Work
    )
  )

# Optional country restriction, currently disabled:
#selected_contries <- c("United States of America", "Australia", "France", "Canada", "Germany", "Ireland", "Italy", "India", "Japan", "Portugal", "South Korea", "Spain", "Hong Kong (S.A.R.)", "United Arab Emirates","United Kingdom of Great Britain and Northern Ireland" )
#kaggleCon <- kaggleCon[kaggleCon$Country %in% selected_contries, ]

# Model: regress Compensation on all remaining columns.
# NOTE(review): the fitted object is stored under the name `lm`, the same name
# as the fitting function. Calls such as lm(...) still resolve to the function,
# but a distinct name would be clearer if nothing else depends on this one.
lm <- lm(Compensation ~ ., data = kaggleCon)

summary(lm)

# Variance inflation check (vif() is presumably car::vif -- confirm), followed
# by in-sample predictions on the same data the model was fitted to.
vif(lm)
predict(lm, newdata = kaggleCon)

10 Limitations

  1. I wanted to remove the demographic bias from the intercept, but time constraints prevented me from doing so.